Use exploratory data analysis to discover as many characteristics of the dataset as needed. This includes the distribution of each feature, its range of values, and the correlations between features.
import pandas as pd
from scipy.stats import chi2_contingency
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from summarytools import dfSummary
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.subplots as sp
import pandas as pd
# Load the per-image metadata table from the local CSV file.
metadata_path = '/Users/donika/Desktop/images/metadata.csv'
data = pd.read_csv(metadata_path)
# Bare expression: rendered as the cell output when run in a notebook.
data
| image_id | lesion_type | confirmation | age | sex | localization | |
|---|---|---|---|---|---|---|
| 0 | IMAGE_0000244 | 0 | histo | 80.0 | male | scalp |
| 1 | IMAGE_0001916 | 0 | histo | 80.0 | male | scalp |
| 2 | IMAGE_0006461 | 0 | histo | 80.0 | male | scalp |
| 3 | IMAGE_0003197 | 0 | histo | 80.0 | male | scalp |
| 4 | IMAGE_0009907 | 0 | histo | 75.0 | male | ear |
| ... | ... | ... | ... | ... | ... | ... |
| 10010 | IMAGE_0007158 | 6 | histo | 40.0 | male | abdomen |
| 10011 | IMAGE_0005939 | 6 | histo | 40.0 | male | abdomen |
| 10012 | IMAGE_0001721 | 6 | histo | 40.0 | male | abdomen |
| 10013 | IMAGE_0008064 | 6 | histo | 80.0 | male | face |
| 10014 | IMAGE_0006942 | -1 | histo | 70.0 | female | back |
10015 rows × 6 columns
# Print column dtypes and non-null counts; a non-null count below the row total reveals missing values.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10015 entries, 0 to 10014 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 image_id 10015 non-null object 1 lesion_type 10015 non-null int64 2 confirmation 10015 non-null object 3 age 9958 non-null float64 4 sex 10015 non-null object 5 localization 10015 non-null object dtypes: float64(1), int64(1), object(4) memory usage: 469.6+ KB
# Flag rows whose image_id occurs more than once; keep=False marks every copy, not only the repeats.
is_repeated = data.duplicated(subset='image_id', keep=False)
data['duplicates'] = is_repeated.map({True: 'has_duplicates', False: 'no_duplicates'})
# Tally how many rows fall into each category.
data['duplicates'].value_counts()
no_duplicates 10015 Name: duplicates, dtype: int64
# Remove the temporary helper column now that the duplicate check is done.
data = data.drop(columns='duplicates')
# reset_index returns a new frame, so the result must be assigned back for the reset to take effect.
data = data.reset_index(drop=True)
# Bare expression: rendered as the cell output when run in a notebook.
data
| image_id | lesion_type | confirmation | age | sex | localization | |
|---|---|---|---|---|---|---|
| 0 | IMAGE_0000244 | 0 | histo | 80.0 | male | scalp |
| 1 | IMAGE_0001916 | 0 | histo | 80.0 | male | scalp |
| 2 | IMAGE_0006461 | 0 | histo | 80.0 | male | scalp |
| 3 | IMAGE_0003197 | 0 | histo | 80.0 | male | scalp |
| 4 | IMAGE_0009907 | 0 | histo | 75.0 | male | ear |
| ... | ... | ... | ... | ... | ... | ... |
| 10010 | IMAGE_0007158 | 6 | histo | 40.0 | male | abdomen |
| 10011 | IMAGE_0005939 | 6 | histo | 40.0 | male | abdomen |
| 10012 | IMAGE_0001721 | 6 | histo | 40.0 | male | abdomen |
| 10013 | IMAGE_0008064 | 6 | histo | 80.0 | male | face |
| 10014 | IMAGE_0006942 | -1 | histo | 70.0 | female | back |
10015 rows × 6 columns
# Count the missing values in each column.
data.isnull().sum()
image_id 0 lesion_type 0 confirmation 0 age 57 sex 0 localization 0 dtype: int64
# Show the last five rows of the table.
data.tail(5)
| image_id | lesion_type | confirmation | age | sex | localization | |
|---|---|---|---|---|---|---|
| 10010 | IMAGE_0007158 | 6 | histo | 40.0 | male | abdomen |
| 10011 | IMAGE_0005939 | 6 | histo | 40.0 | male | abdomen |
| 10012 | IMAGE_0001721 | 6 | histo | 40.0 | male | abdomen |
| 10013 | IMAGE_0008064 | 6 | histo | 80.0 | male | face |
| 10014 | IMAGE_0006942 | -1 | histo | 70.0 | female | back |
# Print every row whose age is missing, to see which other fields are affected.
print(data[data['age'].isnull()])
image_id lesion_type confirmation age sex localization 968 IMAGE_0002572 0 consensus NaN unknown unknown 969 IMAGE_0005635 0 consensus NaN unknown unknown 970 IMAGE_0006644 0 consensus NaN unknown unknown 971 IMAGE_0008701 0 consensus NaN unknown unknown 972 IMAGE_0003426 0 consensus NaN unknown unknown 973 IMAGE_0007632 0 consensus NaN unknown unknown 974 IMAGE_0001188 -1 consensus NaN unknown unknown 975 IMAGE_0003644 0 consensus NaN unknown unknown 977 IMAGE_0007302 -1 consensus NaN unknown unknown 978 IMAGE_0008466 0 consensus NaN unknown unknown 1886 IMAGE_0006193 3 histo NaN male face 1887 IMAGE_0005905 3 histo NaN male face 7558 IMAGE_0006617 1 histo NaN female chest 7961 IMAGE_0000339 1 histo NaN male upper extremity 7962 IMAGE_0006382 1 histo NaN male upper extremity 8410 IMAGE_0003324 1 histo NaN female abdomen 8411 IMAGE_0005997 1 histo NaN female trunk 8555 IMAGE_0002249 1 histo NaN female chest 9386 IMAGE_0009300 1 consensus NaN male foot 9387 IMAGE_0001834 1 consensus NaN male foot 9593 IMAGE_0009672 1 consensus NaN unknown unknown 9594 IMAGE_0004949 1 consensus NaN unknown unknown 9595 IMAGE_0003004 1 consensus NaN unknown unknown 9596 IMAGE_0004892 1 consensus NaN unknown unknown 9597 IMAGE_0004016 1 consensus NaN unknown unknown 9598 IMAGE_0008073 1 consensus NaN unknown unknown 9599 IMAGE_0000232 1 consensus NaN unknown unknown 9600 IMAGE_0009863 1 consensus NaN unknown unknown 9601 IMAGE_0009673 1 consensus NaN unknown unknown 9602 IMAGE_0009741 1 consensus NaN unknown unknown 9603 IMAGE_0001695 1 consensus NaN unknown unknown 9604 IMAGE_0006099 1 consensus NaN unknown unknown 9605 IMAGE_0001726 1 consensus NaN unknown unknown 9606 IMAGE_0002695 1 consensus NaN unknown unknown 9607 IMAGE_0007191 1 consensus NaN unknown unknown 9608 IMAGE_0002758 -1 consensus NaN unknown unknown 9609 IMAGE_0007501 -1 consensus NaN unknown unknown 9610 IMAGE_0004688 1 consensus NaN unknown unknown 9611 IMAGE_0005706 1 consensus NaN unknown unknown 9612 
IMAGE_0001426 1 consensus NaN unknown unknown 9613 IMAGE_0007223 1 consensus NaN unknown unknown 9614 IMAGE_0009278 1 consensus NaN unknown unknown 9615 IMAGE_0008546 1 consensus NaN unknown unknown 9616 IMAGE_0003172 1 consensus NaN unknown unknown 9617 IMAGE_0003201 1 consensus NaN unknown unknown 9618 IMAGE_0004139 1 consensus NaN unknown unknown 9619 IMAGE_0004086 1 consensus NaN unknown unknown 9620 IMAGE_0009780 1 consensus NaN unknown unknown 9621 IMAGE_0006578 1 consensus NaN unknown unknown 9622 IMAGE_0002726 1 consensus NaN unknown unknown 9623 IMAGE_0008295 1 consensus NaN unknown unknown 9624 IMAGE_0008000 1 consensus NaN unknown unknown 9625 IMAGE_0000736 1 consensus NaN unknown unknown 9626 IMAGE_0009753 1 consensus NaN unknown unknown 9627 IMAGE_0000167 1 consensus NaN unknown unknown 9628 IMAGE_0002730 1 consensus NaN unknown unknown 9629 IMAGE_0000536 1 consensus NaN unknown unknown
# mean_ages = data.groupby('lesion_type')['age'].mean()
# mean_ages
# def impute_age(row):
# if pd.isnull(row['age']):
# return mean_ages[row['lesion_type']]
# else:
# return row['age']
# data['Age'] = data.apply(impute_age, axis=1)
# data = data.drop('Age', axis=1)
# data.info()
# Impute missing ages from the lesion type with a linear model.
# NOTE: `df` is an alias of `data` (no copy is made), so every change made through
# `df` is also visible through `data`; later cells read the updated values via `data`.
df = data
missing_age = df['age'].isna()

# lesion_type holds nominal codes (including -1), not quantities, so it is one-hot
# encoded rather than passed to the regression as a single numeric feature.
# dtype=float keeps the design matrix numeric regardless of the pandas version.
lesion_features = pd.get_dummies(df['lesion_type'], dtype=float)
x_train = lesion_features[~missing_age].to_numpy()
y_train = df.loc[~missing_age, 'age']
x_pred = lesion_features[missing_age].to_numpy()

model = LinearRegression()
model.fit(x_train, y_train)

# Skip prediction when nothing is missing (e.g. when the cell is re-run after
# imputation): predicting on an empty array raises a ValueError.
if missing_age.any():
    predicted_ages = model.predict(x_pred)
    df.loc[missing_age, 'age'] = predicted_ages

# Confirm that the age column no longer contains missing values.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10015 entries, 0 to 10014 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 image_id 10015 non-null object 1 lesion_type 10015 non-null int64 2 confirmation 10015 non-null object 3 age 10015 non-null float64 4 sex 10015 non-null object 5 localization 10015 non-null object dtypes: float64(1), int64(1), object(4) memory usage: 469.6+ KB
# Translate the integer lesion codes into their diagnosis abbreviations.
lesion_labels = ["BKL", "NV", "DF", "MEL", "VASC", "BCC", "AKIEC"]
mapping = dict(enumerate(lesion_labels))  # codes 0..6, in list order
mapping[-1] = "Unknown"  # code -1 is labelled "Unknown"
df['lesion_type'] = df['lesion_type'].map(mapping)
# Number of images per lesion label.
df['lesion_type'].value_counts()
NV 6029 MEL 1006 Unknown 1000 BKL 982 BCC 473 AKIEC 293 VASC 124 DF 108 Name: lesion_type, dtype: int64
# Render a per-column summary table (values, frequencies, missing counts) using summarytools.
dfSummary(df)
| No | Variable | Stats / Values | Freqs / (% of Valid) | Graph | Missing |
|---|---|---|---|---|---|
| 1 | image_id [object] |
1. IMAGE_0000244 2. IMAGE_0007439 3. IMAGE_0005225 4. IMAGE_0005466 5. IMAGE_0008331 6. IMAGE_0003918 7. IMAGE_0006038 8. IMAGE_0001999 9. IMAGE_0006255 10. IMAGE_0002965 11. other |
1 (0.0%) 1 (0.0%) 1 (0.0%) 1 (0.0%) 1 (0.0%) 1 (0.0%) 1 (0.0%) 1 (0.0%) 1 (0.0%) 1 (0.0%) 10,005 (99.9%) |
0 (0.0%) |
|
| 2 | lesion_type [object] |
1. NV 2. MEL 3. Unknown 4. BKL 5. BCC 6. AKIEC 7. VASC 8. DF |
6,029 (60.2%) 1,006 (10.0%) 1,000 (10.0%) 982 (9.8%) 473 (4.7%) 293 (2.9%) 124 (1.2%) 108 (1.1%) |
0 (0.0%) |
|
| 3 | confirmation [object] |
1. histo 2. follow_up 3. consensus 4. confocal |
5,340 (53.3%) 3,704 (37.0%) 902 (9.0%) 69 (0.7%) |
0 (0.0%) |
|
| 4 | age [float64] |
Mean (sd) : 51.9 (16.9) min < med < max: 0.0 < 50.0 < 85.0 IQR (CV) : 25.0 (3.1) |
22 distinct values | 0 (0.0%) |
|
| 5 | sex [object] |
1. male 2. female 3. unknown |
5,406 (54.0%) 4,552 (45.5%) 57 (0.6%) |
0 (0.0%) |
|
| 6 | localization [object] |
1. back 2. lower extremity 3. trunk 4. upper extremity 5. abdomen 6. face 7. chest 8. foot 9. unknown 10. neck 11. other |
2,192 (21.9%) 2,077 (20.7%) 1,404 (14.0%) 1,118 (11.2%) 1,022 (10.2%) 745 (7.4%) 407 (4.1%) 319 (3.2%) 234 (2.3%) 168 (1.7%) 329 (3.3%) |
0 (0.0%) |
The dataset is imbalanced: some lesion types have many more samples than others. NV alone accounts for about 60% of the images, whereas VASC and DF each make up only about 1%.
# One-hot encode the lesion type so that each diagnosis becomes its own indicator column.
df_dummies = pd.get_dummies(df, columns=['lesion_type'])
# Strip the 'lesion_type_' prefix from the column names; the first column is left untouched.
df_dummies.columns = [
    col if i == 0 else col.replace('lesion_type_', '')
    for i, col in enumerate(df_dummies.columns)
]
# Drop the indicator for the "Unknown" label; only the seven named diagnoses are aggregated below.
df_dummies = df_dummies.drop(columns="Unknown")

lesion_columns = ["BKL", "DF", "MEL", "VASC", "BCC", "AKIEC", "NV"]
agg_columns = {name: 'sum' for name in lesion_columns}

# Number of images per diagnosis, grouped by age and by confirmation method.
age = df_dummies.pivot_table(index='age', values=lesion_columns, aggfunc=agg_columns).reset_index()
confirmation = df_dummies.pivot_table(index='confirmation', values=lesion_columns, aggfunc=agg_columns).reset_index()

# Count plot of the confirmation methods, most frequent first.
counts = data['confirmation'].value_counts()
sorted_ = counts.index.tolist()
plt.figure(figsize=(4, 3))
sns.countplot(data=data, x='confirmation', order=sorted_)
plt.title('Diagnose type')
# The x-axis holds the confirmation method, so it is labelled accordingly (not 'Gender').
plt.xlabel('Confirmation Method')
plt.ylabel('Count')
plt.xticks(rotation=90)
plt.show()

# Bare expression: rendered as the cell output when run in a notebook.
confirmation
| confirmation | AKIEC | BCC | BKL | DF | MEL | NV | VASC | |
|---|---|---|---|---|---|---|---|---|
| 0 | confocal | 0 | 0 | 60 | 0 | 0 | 0 | 0 |
| 1 | consensus | 0 | 0 | 242 | 54 | 0 | 450 | 67 |
| 2 | follow_up | 0 | 0 | 0 | 0 | 0 | 3326 | 0 |
| 3 | histo | 293 | 473 | 680 | 54 | 1006 | 2253 | 57 |
The majority of diagnoses are established through histological confirmation.
# Kendall rank correlation between age and each lesion-type indicator column.
corr_columns = ['age', 'AKIEC', 'BCC', 'BKL', 'DF', 'MEL', 'NV', 'VASC']
age_corr = df_dummies.loc[:, corr_columns]
age_corr.corr(method='kendall').head(10)
| age | AKIEC | BCC | BKL | DF | MEL | NV | VASC | |
|---|---|---|---|---|---|---|---|---|
| age | 1.000000 | 0.131514 | 0.165680 | 0.210403 | 0.004071 | 0.149270 | -0.335248 | 0.002776 |
| AKIEC | 0.131514 | 1.000000 | -0.038652 | -0.057239 | -0.018126 | -0.058012 | -0.213506 | -0.019438 |
| BCC | 0.165680 | -0.038652 | 1.000000 | -0.073409 | -0.023246 | -0.074400 | -0.273820 | -0.024929 |
| BKL | 0.210403 | -0.057239 | -0.073409 | 1.000000 | -0.034426 | -0.110179 | -0.405503 | -0.036917 |
| DF | 0.004071 | -0.018126 | -0.023246 | -0.034426 | 1.000000 | -0.034890 | -0.128409 | -0.011690 |
| MEL | 0.149270 | -0.058012 | -0.074400 | -0.110179 | -0.034890 | 1.000000 | -0.410974 | -0.037415 |
| NV | -0.335248 | -0.213506 | -0.273820 | -0.405503 | -0.128409 | -0.410974 | 1.000000 | -0.137703 |
| VASC | 0.002776 | -0.019438 | -0.024929 | -0.036917 | -0.011690 | -0.037415 | -0.137703 | 1.000000 |
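The negative correlations between the lesion-type columns are expected: each image carries exactly one lesion label, so the one-hot indicator columns are mutually exclusive. They therefore say nothing about whether different lesions co-occur or share causal factors. Age is positively correlated with BKL, BCC, MEL and AKIEC and negatively correlated with NV (Kendall tau of about -0.34), i.e. nevi are more common in younger patients while the other lesion types tend to become more frequent as age increases.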
# Bar chart of the number of images per lesion type at each age.
fig = px.bar(age, x='age', y=age.columns[1:], title="Bar Plot of Lesion type associated with aging")
# The x-axis carries the age values, so it is labelled "Age".
fig.update_layout(xaxis_title="Age", yaxis_title="Lesions")
fig.update_xaxes(range=[0, 95])
# The y-range is deliberately narrow (presumably to make the rare lesion types visible);
# use the plot's autoscale control to see the full range.
fig.update_yaxes(range=[0, 15])
fig.show()
Use the plot's autoscale control to show the full value range, and click the entries in the colour legend (titled 'variable') to toggle the display of individual lesion types on or off.
# Line chart of the number of images per lesion type at each age.
fig = px.line(age, x='age', y=age.columns[1:], title="Line Plot of Lesion type associated with aging")
# The x-axis carries the age values, so it is labelled "Age".
fig.update_layout(xaxis_title="Age", yaxis_title="Lesions")
fig.update_xaxes(range=[0, 95])
# The y-range is deliberately narrow (presumably to make the rare lesion types visible);
# use the plot's autoscale control to see the full range.
fig.update_yaxes(range=[0, 15])
fig.show()
Use the plot's autoscale control to show the full value range, and click the entries in the colour legend (titled 'variable') to toggle the display of individual lesion types on or off.
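In the zoomed plots above, the rarer lesion types such as DF, VASC, BCC and AKIEC already appear at young ages. Note, however, that in the Kendall correlation table only NV is negatively correlated with age (about -0.34); BCC and AKIEC are positively correlated with age, so they become relatively more frequent in older patients.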
# Rank the numeric columns by the absolute value of their correlation with age and show the top five.
correlations = df_dummies.corr(numeric_only=True)
age_correlations = correlations.loc[:, 'age']
abs_age_correlations = age_correlations.abs()
sorted_correlations = abs_age_correlations.sort_values(ascending=False).head(5)
print(sorted_correlations)
age 1.000000 NV 0.387496 BKL 0.243930 BCC 0.196724 MEL 0.173213 Name: age, dtype: float64
# Count the images for every (lesion type, body location) combination.
# reset_index(name='count') names the count column explicitly, so the code does not
# depend on how a given pandas version names the value_counts() result.
value = data[['lesion_type', 'localization']].value_counts().reset_index(name='count')
temp = value.rename(columns={'localization': 'location'})

# Grouped bar chart: one group per lesion type, one bar per location.
bar, ax = plt.subplots(figsize=(20, 10))
sns.barplot(x='lesion_type', y='count', hue='location', data=temp, palette="brg_r", ax=ax)
plt.title('Location of lesions by type', size=15)
plt.xlabel('Disease', size=10)
plt.ylabel('Count', size=10)
plt.xticks(rotation=90)
plt.show()
def _plot_category_counts(frame, column, title, xlabel, figsize, rotate_ticks=True):
    """Draw a count plot of ``frame[column]`` with the bars ordered by descending frequency.

    ``rotate_ticks`` turns the x tick labels by 90 degrees when True.
    """
    # value_counts() sorts by frequency, so its index gives the bar order.
    order = frame[column].value_counts().index.tolist()
    plt.figure(figsize=figsize)
    sns.countplot(data=frame, x=column, order=order)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Count')
    if rotate_ticks:
        plt.xticks(rotation=90)
    plt.show()


# One count plot per categorical column.
_plot_category_counts(data, 'lesion_type', 'Distribution of Lesion Types', 'Lesion Type',
                      (7, 6), rotate_ticks=False)
_plot_category_counts(data, 'confirmation', 'Distribution of Confirmation Methods',
                      'Confirmation Method', (7, 6))
_plot_category_counts(data, 'localization', 'Distribution of Lesion Localizations',
                      'Localization', (8, 6))
_plot_category_counts(data, 'sex', 'Distribution of Gender', 'Gender', (4, 3))
In this dataset, more images come from male patients (54.0%) than from female patients (45.5%); for the remaining 0.6% of the images the sex is unknown.
# Count the images for every (lesion type, sex) combination.
# reset_index(name='count') names the count column explicitly, so the code does not
# depend on how a given pandas version names the value_counts() result.
value = data[['lesion_type', 'sex']].value_counts().reset_index(name='count')
temp = value

# Grouped bar chart: one group per lesion type, one bar per sex.
bar, ax = plt.subplots(figsize=(20, 10))
sns.barplot(x='lesion_type', y='count', hue='sex', data=temp, palette="brg_r", ax=ax)
plt.title('Lesions over gender', size=15)
plt.xlabel('Disease', size=10)
plt.ylabel('Count', size=10)
plt.xticks(rotation=90)
plt.show()
# Split the table by sex; rows with any other sex value are in neither subset.
female = df[df['sex'] == 'female']
male = df[df['sex'] == 'male']

# One pie chart per subset showing how the images are distributed over body locations.
for subset, pie_title in (
    (female, "Location of lesions in females"),
    (male, "Location of lesions in males"),
):
    # After counting, the 'lesion_type' column holds the number of images per location.
    temp_df = subset.groupby("localization")["lesion_type"].count().reset_index()
    fig = px.pie(
        temp_df,
        values='lesion_type',
        names='localization',
        title=pie_title,
        color='lesion_type',
        color_discrete_sequence=px.colors.sequential.Rainbow,
    )
    fig.show()
Lesions in females are most commonly located on the lower extremity, whereas in males they occur most frequently on the back. This raises the question of why such sex-specific patterns in lesion localization exist.
# Number of images per age, computed separately for the female and male subsets.
# After counting, the 'lesion_type' column holds the number of images at that age.
temp_df_female = female.groupby("age").agg({"lesion_type": 'count'}).reset_index()
temp_df_male = male.groupby("age").agg({"lesion_type": 'count'}).reset_index()

# The charts show lesion counts per age (not body locations), and the titles say so.
fig_female = px.bar(temp_df_female, x='age', y='lesion_type', title="Number of lesions in females by age",
                    labels={'age': 'Age', 'lesion_type': 'Lesion Count'},
                    color='age',
                    color_continuous_scale='Rainbow')
fig_male = px.bar(temp_df_male, x='age', y='lesion_type', title="Number of lesions in males by age",
                  labels={'age': 'Age', 'lesion_type': 'Lesion Count'},
                  color='age',
                  color_continuous_scale='Rainbow')

# Place both charts side by side by copying their traces into a two-column subplot figure.
fig = sp.make_subplots(rows=1, cols=2, subplot_titles=('Females', 'Males'))
for trace in fig_female.data:
    fig.add_trace(trace, row=1, col=1)
for trace in fig_male.data:
    fig.add_trace(trace, row=1, col=2)
fig.update_layout(title="Number of lesions in Females and Males over Age", showlegend=False)
fig.show()
The number of lesions declines with age more slowly for men than for women.
# Tally the images for each (location, age) pair.
temp = (
    df[['localization', 'age']]
    .value_counts()
    .reset_index(name='count')
    .rename(columns={'localization': 'location'})
)
# Round the ages to two decimals (presumably to shorten the fractional imputed ages in the legend).
temp['age'] = temp['age'].round(2)

# Grouped bar chart: one group per location, one bar per age value.
bar, ax = plt.subplots(figsize=(10, 10))
sns.barplot(data=temp, x='location', y='count', hue='age', ax=ax)
plt.title('Location of lesion over Age', size=16)
plt.xlabel('Location', size=10)
plt.ylabel('Count', size=10)
plt.xticks(rotation=90)
plt.show()
The age distribution varies significantly among the different lesion location types.
# Chi-square test of independence between the lesion type and each categorical column.
categorical_vars = ['confirmation', 'sex', 'localization']
for x in categorical_vars:
    # Cross-tabulate lesion type (rows) against the current column (columns).
    contingency_table = pd.crosstab(index=data['lesion_type'], columns=data[x])
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    # The trailing "\n" in the last part leaves a blank line between the reports.
    print(
        f"Chi-square test of independence for lesion_type and {x}:",
        f"Chi-square value = {chi2}",
        f"p-value = {p}\n",
        sep="\n",
    )
Chi-square test of independence for lesion_type and confirmation: Chi-square value = 4101.270254878973 p-value = 0.0 Chi-square test of independence for lesion_type and sex: Chi-square value = 107.08014205360861 p-value = 2.0563742019869162e-16 Chi-square test of independence for lesion_type and localization: Chi-square value = 2553.136892145003 p-value = 0.0
The p-values (less than 0.05) indicate a significant association between the lesion type and confirmation method, patient's sex, and lesion localization.